Purpose: determine to what extent the current data can accurately describe correlations and underlying factors of the score. Especially concerning the 'before' groups: are there underlying groups explaining the discrepancies in score? Are those groups tied to certain questions?
In [ ]:
%run "../Functions/1. Google form analysis.ipynb"
Purpose: find out which questions have the most weight in the computation of the score.
Other leads: LDA, ANOVA.
Source for PCA: http://sebastianraschka.com/Articles/2015_pca_in_3_steps.html
In [ ]:
# Table analysed throughout this notebook. getAllBinarized is not defined here;
# presumably it comes from the notebook loaded with %run above — TODO confirm.
binarized = getAllBinarized()
In [ ]:
# Per-row total: the dot product with a vector of ones sums every column of a row.
score = np.dot(binarized,np.ones(len(binarized.columns)))
In [ ]:
# Number of columns of binarized, recorded before the 'class' column is added.
dimensions = binarized.shape[1]
# Display the value.
dimensions
In [ ]:
# Label column, initialised to 'default' for every row; classifyAndPlot
# overwrites it each time it is called.
binarized['class'] = 'default'
In [ ]:
# Split the table into the data matrix X (the first `dimensions` columns)
# and the class labels y (the column at position `dimensions`).
X = binarized.iloc[:,0:dimensions].values
y = binarized.iloc[:,dimensions].values
In [ ]:
from sklearn.preprocessing import StandardScaler
# Standardise every column of X before the eigendecomposition.
X_std = StandardScaler().fit_transform(X)
In [ ]:
# Covariance matrix computed by hand: centre the columns, then divide the
# cross-product of the centred data by (number of rows - 1).
mean_vec = np.mean(X_std, axis=0)
cov_mat = (X_std - mean_vec).T.dot((X_std - mean_vec)) / (X_std.shape[0]-1)
print('Covariance matrix \n%s' %cov_mat)
In [ ]:
# Same matrix through np.cov, for comparison with the manual computation.
print('NumPy covariance matrix: \n%s' %np.cov(X_std.T))
In [ ]:
# Eigendecomposition of the covariance matrix of the standardised data.
cov_mat = np.cov(X_std.T)
# eig_vals[i] belongs to the column eig_vecs[:, i].
eig_vals, eig_vecs = np.linalg.eig(cov_mat)
#print('Eigenvectors \n%s' %eig_vecs)
print('\nEigenvalues \n%s' %eig_vals)
In [ ]:
# Eigendecomposition of the correlation matrix of the standardised data.
cor_mat1 = np.corrcoef(X_std.T)
# Overwrites the eig_vals / eig_vecs computed from the covariance matrix;
# every later cell therefore works with the correlation-based values.
eig_vals, eig_vecs = np.linalg.eig(cor_mat1)
#print('Eigenvectors \n%s' %eig_vecs)
print('\nEigenvalues \n%s' %eig_vals)
In [ ]:
# Singular value decomposition of the transposed standardised data.
u,s,v = np.linalg.svd(X_std.T)
In [ ]:
# Display the singular values returned by the SVD.
s
In [ ]:
# Sanity check: every eigenvector must have unit length.
# np.linalg.eig returns the eigenvectors as the COLUMNS of eig_vecs, so the
# loop runs over the transpose; iterating eig_vecs directly would test its
# rows instead of the eigenvectors.
for ev in eig_vecs.T:
    np.testing.assert_array_almost_equal(1.0, np.linalg.norm(ev))
print('Everything ok!')
In [ ]:
# Make a list of (eigenvalue, eigenvector) tuples; column i of eig_vecs is the
# eigenvector that belongs to eig_vals[i].
eig_pairs = [(np.abs(eig_vals[i]), eig_vecs[:, i]) for i in range(len(eig_vals))]

# Sort the tuples from high to low on the eigenvalue ONLY. A bare sort()
# compares whole tuples, so two equal eigenvalues would make Python compare
# the eigenvector arrays and raise "truth value of an array is ambiguous".
eig_pairs.sort(key=lambda pair: pair[0], reverse=True)

# Visually confirm that the list is correctly sorted by decreasing eigenvalues
print('Eigenvalues in descending order:')
for i in eig_pairs:
    print(i[0])
In [ ]:
# Share of the total variance carried by each component, in percent,
# ordered from the largest eigenvalue to the smallest.
tot = sum(eig_vals)
var_exp = [
    (eigenvalue / tot)*100
    for eigenvalue in sorted(eig_vals, reverse=True)
]
# Running total of the percentages above.
cum_var_exp = np.cumsum(var_exp)

# Bars: individual share per component; step line: cumulative share.
with plt.style.context('seaborn-whitegrid'):
    plt.figure(figsize=(6, 4))

    plt.bar(
        range(dimensions),
        var_exp,
        alpha=0.5,
        align='center',
        label='individual explained variance',
    )
    plt.step(
        range(dimensions),
        cum_var_exp,
        where='mid',
        label='cumulative explained variance',
    )
    plt.ylabel('Explained variance ratio')
    plt.xlabel('Principal components')
    plt.legend(loc='best')
    plt.tight_layout()
In [ ]:
# Display the five largest individual shares of variance (in percent).
var_exp[:5]
In [ ]:
# Display the cumulative share of variance after 1 to 5 components (in percent).
cum_var_exp[:5]
In [ ]:
# Projection matrix W: the eigenvectors of the two largest eigenvalues
# (eig_pairs is sorted in descending order), placed side by side as columns,
# giving a (dimensions x 2) matrix.
matrix_w = np.hstack((eig_pairs[0][1].reshape(dimensions,1),
eig_pairs[1][1].reshape(dimensions,1)))
print('Matrix W:\n', matrix_w)
In [ ]:
# Display the column labels of gform.
gform.columns
In [ ]:
# Named colours available for per-class plots, and how many there are.
colors = ('blue','red','green','magenta','cyan','purple','yellow','black','white')
len(colors)
In [ ]:
# Project the standardised data onto the two components stored in matrix_w.
Y = X_std.dot(matrix_w)
In [ ]:
# Scatter plot of every row in the plane of the two projected components,
# without any class distinction.
with plt.style.context('seaborn-whitegrid'):
    plt.figure(figsize=(6, 4))
    ax = plt.subplot(111)

    plt.scatter(Y[:, 0], Y[:, 1])

    plt.xlabel('Principal Component 1')
    plt.ylabel('Principal Component 2')
    plt.title("base PCA")
    plt.show()
import mca
X = binarized.iloc[:,0:dimensions].values
y = binarized.iloc[:,dimensions].values
X_std.shape
xstddf = pd.DataFrame(X_std)
Y2 = mca.MCA(xstddf, ncols=dimensions)
with plt.style.context('seaborn-whitegrid'):
    plt.figure(figsize=(6, 4))
    ax = plt.subplot(111)
    plt.scatter(Y2[:, 0], Y2[:, 1])
    plt.xlabel('Principal Component 1')
    plt.ylabel('Principal Component 2')
    plt.title("base MCA")
    plt.show()
In [ ]:
def classifyAndPlot(classNames, classes, title = '', rainbow = False):
    """Label every row of `binarized` with a class, then plot the PCA projection per class.

    Relies on the notebook globals `gform`, `binarized`, `localplayerguidkey`,
    `dimensions` and `Y`, and overwrites the 'class' column of `binarized`.

    Parameters
    ----------
    classNames : list or tuple
        One label per entry of `classes`, optionally followed by one extra
        label that is used as the default class. The argument is copied and
        never modified.
    classes : sequence
        One collection of player ids per class; each entry must expose
        `.values` (e.g. a pandas Series).
    title : str
        Plot title; no title is set when empty.
    rainbow : bool
        Use the rainbow colormap even when the fixed palette is large enough.
    """
    # Work on a private copy: the default label may be appended below, and
    # the caller's sequence (possibly a tuple) must stay untouched.
    classNames = list(classNames)
    defaultClassName = ''

    # A default class is needed only when the given classes hold fewer ids
    # than there are rows in gform.
    sampleSize = sum(len(members) for members in classes)
    if sampleSize < gform.shape[0]:
        if len(classNames) == len(classes) + 1:
            defaultClassName = classNames[-1]
        else:
            defaultClassName = 'other'
            classNames.append(defaultClassName)

    for labelIndex in binarized.index:
        # Index labels look like 'corrections<i>', <i> being a row position in gform.
        i = int(labelIndex[len('corrections'):])
        userId = gform.iloc[i][localplayerguidkey]
        isUserSet = False
        # No early exit: when an id belongs to several classes, the last match wins.
        for className, members in zip(classNames, classes):
            if userId in members.values:
                binarized.loc[labelIndex, 'class'] = className
                isUserSet = True
        if not isUserSet:
            if defaultClassName not in classNames:
                print("unexpected error: check the exhaustiveness of the provided classes")
            binarized.loc[labelIndex, 'class'] = defaultClassName

    y = binarized.iloc[:, dimensions].values

    with plt.style.context('seaborn-whitegrid'):
        plt.figure(figsize=(6, 4))
        ax = plt.subplot(111)

        colors = ('blue','red','green','magenta','cyan','purple','yellow','black','white')
        if rainbow or len(classNames) > len(colors):
            colors = plt.cm.rainbow(np.linspace(1, 0, len(classNames)))
        colors = colors[:len(classNames)]

        for lab, col in zip(classNames, colors):
            # `color=` (not `c=`): a single RGBA row passed as `c` is
            # indistinguishable from four values to be colour-mapped.
            plt.scatter(Y[y==lab, 0],
                        Y[y==lab, 1],
                        label=lab,
                        color=col)
        plt.xlabel('Principal Component 1')
        plt.ylabel('Principal Component 2')

        # source https://stackoverflow.com/questions/4700614/how-to-put-the-legend-out-of-the-plot
        # Put a legend to the right of the current axis
        ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))

        if len(title) > 0:
            plt.title(title)

        plt.show()
In [ ]:
# Rows whose binarized answer to this question equals 1.
answered = binarized[binarized['Guess: the bacterium would glow yellow...'] == 1]
# Index labels look like 'corrections<i>'; <i> is used as a row position in gform.
indices = answered.index.map(lambda label: int(label[len('corrections'):]))
surveys = gform.iloc[indices][localplayerguidkey]

#classifyAndPlot(['guessed', 'did not'], [surveys])

# Inputs of the inlined run below.
title = 'test title'
rainbow = True
alreadyDefaultClassName = True
classNames = ['guessed', 'did not']
classes = [surveys]

# The rest of this cell mirrors the body of
# classifyAndPlot(classNames, classes, title = '', rainbow = False), run at
# top level (presumably so that each intermediate variable can be inspected).
defaultClassName = ''
sampleSize = 0

for classIndex in range(0, len(classes)):
    sampleSize += len(classes[classIndex])

# A default class is needed only when the classes hold fewer ids than gform has rows.
if sampleSize < gform.shape[0]:
    if len(classNames) == len(classes) + 1:
        defaultClassName = classNames[-1]
    else:
        defaultClassName = 'other'
        classNames.append(defaultClassName)

for labelIndex in binarized.index:
    i = int(labelIndex[len('corrections'):])
    isUserSet = False
    for classIndex in range(0, len(classes)):
        if gform.iloc[i][localplayerguidkey] in classes[classIndex].values:
            binarized.loc[labelIndex,'class'] = classNames[classIndex]
            isUserSet = True
    if not isUserSet:
        if defaultClassName not in classNames:
            print("unexpected error: check the exhaustiveness of the provided classes")
        binarized.loc[labelIndex,'class'] = defaultClassName

y = binarized.iloc[:,dimensions].values

with plt.style.context('seaborn-whitegrid'):
    plt.figure(figsize=(6, 4))
    ax = plt.subplot(111)

    colors = ('blue','red','green','magenta','cyan','purple','yellow','black','white')
    if rainbow or len(classNames) > len(colors):
        colors = plt.cm.rainbow(np.linspace(1, 0, len(classNames)))
    colors = colors[:len(classNames)]

    for lab, col in zip(classNames,colors):
        # `color=` (not `c=`): a single RGBA row passed as `c` is
        # indistinguishable from four values to be colour-mapped.
        plt.scatter(Y[y==lab, 0],
                    Y[y==lab, 1],
                    label=lab,
                    color=col)
    plt.xlabel('Principal Component 1')
    plt.ylabel('Principal Component 2')

    # source https://stackoverflow.com/questions/4700614/how-to-put-the-legend-out-of-the-plot
    # Put a legend to the right of the current axis
    ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))

    if len(title) > 0:
        plt.title(title)

    plt.show()
In [ ]:
# Rows whose binarized answer to this question equals 1.
answered = binarized[binarized['Guess: the bacterium would glow yellow...'] == 1]
# Index labels look like 'corrections<i>'; <i> is used as a row position in gform.
indices = answered.index.map(lambda label: int(label[len('corrections'):]))
# Player ids of those rows: the single explicit class.
surveys = gform.iloc[indices][localplayerguidkey]
# Two names for one class: classifyAndPlot uses the extra last name ('did not')
# as the default label when the class does not cover every row of gform.
classifyAndPlot(['guessed', 'did not'], [surveys])
In [ ]:
# One explicit class built from getSurveysOfBiologists (defined outside this
# file; meaning of the `True` argument not visible here — TODO confirm);
# 'other' serves as the default label for the remaining rows.
classifyAndPlot(['biologist', 'other'], [getSurveysOfBiologists(gform, True)[localplayerguidkey]], title = 'biologists and non-biologists')
In [ ]:
# One explicit class built from getSurveysOfGamers (defined outside this
# file; meaning of the `True` argument not visible here — TODO confirm);
# 'other' serves as the default label for the remaining rows.
classifyAndPlot(['gamer', 'other'], [getSurveysOfGamers(gform, True)[localplayerguidkey]], title = 'gamers and non-gamers')
In [ ]:
# One class per distinct answer, in value_counts() order; each class holds
# the player ids of the rows that gave that answer.
classNames = list(gform['Are you interested in biology?'].value_counts().index)
classes = [
    gform[gform['Are you interested in biology?'] == answer][localplayerguidkey]
    for answer in classNames
]

# Extra trailing name: picked up by classifyAndPlot as the default label.
classNames.append('other')

classifyAndPlot(classNames, classes, rainbow = True, title = 'interest in biology')
In [ ]:
#np.plot(score)
In [ ]:
# One class per distinct score value.
classNames = []
classes = []
for thisScore in np.unique(score):
    classNames.append(thisScore)
    # score is aligned with the rows of binarized, so np.where yields
    # positions in binarized, not labels of gform.
    index = np.where(score == thisScore)[0]
    # Translate those positions into gform row positions through the
    # 'corrections<i>' index labels, the same convention classifyAndPlot uses,
    # instead of treating them as gform.loc labels.
    gformRows = binarized.index[index].map(lambda label: int(label[len('corrections'):]))
    classes.append(gform.iloc[gformRows][localplayerguidkey])
classifyAndPlot(classNames, classes, rainbow = True, title = 'score')
In [ ]:
# One class per distinct answer to the age question, in ascending order;
# each class holds the player ids of the rows that gave that answer.
question = 'How old are you?'
classNames = list(np.sort(gform[question].unique()))
classes = [
    gform[gform[question] == answer][localplayerguidkey]
    for answer in classNames
]

classifyAndPlot(classNames, classes, rainbow = True, title = 'age')
In [ ]:
# Display the first five column labels of gform.
gform.columns[:5]
In [ ]:
from itertools import chain

# Column positions of gform to plot. Skipped positions:
#   0  Timestamp
#   3  Age
#   40 Remarks
#   41 ID
questionRange = chain(range(1,3), range(4,40), range(42,44))

# One plot per selected column, with one class per distinct answer
# (in value_counts() order).
for questionIndex in questionRange:
    question = gform.columns[questionIndex]
    classNames = list(gform[question].value_counts().index)
    classes = [
        gform[gform[question] == answer][localplayerguidkey]
        for answer in classNames
    ]
    classifyAndPlot(classNames, classes, title = question, rainbow = False)
In [ ]:
# Display the eigenvalues from the most recent eigendecomposition.
eig_vals
In [ ]:
# Display the first ROW of eig_vecs.
# NOTE(review): np.linalg.eig stores the eigenvectors as columns
# (eig_vecs[:, i]), so this row is not an eigenvector — confirm whether
# eig_vecs[:, 0] or eig_pairs[0][1] was intended.
eig_vecs[0]
In [ ]:
# Question with the largest absolute weight in the first principal component.
# eig_pairs[0][1] is the eigenvector of the largest eigenvalue (a column of
# eig_vecs); eig_vecs[0] is a row of that matrix, not an eigenvector.
maxComponentIndex = np.argmax(abs(eig_pairs[0][1]))
binarized.columns[maxComponentIndex]
In [ ]:
# Squared length of the first principal component (1 for a unit vector), then
# the component itself. eig_pairs[0][1] is used rather than eig_vecs[0],
# which is a row of the eigenvector matrix and not an eigenvector.
sum(eig_pairs[0][1]*eig_pairs[0][1])
eig_pairs[0][1]
In [ ]:
# Rank the questions by their absolute weight in the first principal component.
# eig_pairs[0][1] is the eigenvector of the largest eigenvalue; eig_vecs[0]
# would be a row of the eigenvector matrix, not a component.
firstComponentWeights = abs(eig_pairs[0][1])
# argsort yields each position exactly once, even when two weights are equal
# (a value lookup with np.where would return the same position twice).
sortedIndices = np.argsort(firstComponentWeights)[::-1]
descendingWeights = firstComponentWeights[sortedIndices]
# Index: weights in descending order; single data column: the matching question.
sortedQuestions0 = pd.DataFrame(index = descendingWeights, data = binarized.columns[sortedIndices])
sortedQuestions0
In [ ]:
In [ ]:
# Imported here as well so that this cell does not depend on a later cell
# having been run first.
from matplotlib import cm

def accessFirst(a):
    """Return the first element of `a`."""
    return a[0]

sortedQuestionsLastIndex = 10
# sortedQuestionsLastIndex + 1 evenly spaced sample points in [0, 1).
array1 = np.arange(sortedQuestionsLastIndex+1.)/(sortedQuestionsLastIndex + 1.)

# Colormaps compared below, in display order.
candidateColormaps = (cm.Accent, cm.Dark2, cm.Paired, cm.Pastel1,
                      cm.Pastel2, cm.Set1, cm.Set2, cm.Set3)

# Displayed tuple: the number of sample points, then, for each colormap, the
# number of distinct first-channel values among the sampled colours.
# The expression is parenthesised instead of ending on a dangling `,\`.
(sortedQuestionsLastIndex+1,) + tuple(
    len(np.unique(np.apply_along_axis(accessFirst, 1, colormap(array1))))
    for colormap in candidateColormaps
)
In [ ]:
from matplotlib import cm
def displayQuestionsContributions(
    sortedQuestions,
    title = "Contributions of questions to component",
    sortedQuestionsLastIndex = 10
):
    """Show a pie chart of the largest question weights of one component.

    Parameters
    ----------
    sortedQuestions : pandas.DataFrame
        Its index holds the weights and its values hold the question labels;
        the first `sortedQuestionsLastIndex` rows get a wedge of their own.
    title : str
        Title of the chart.
    sortedQuestionsLastIndex : int
        Number of individually displayed questions; the weights of all
        remaining rows are summed into one extra wedge labelled 'others'.

    Side effect: sets plt.rcParams['patch.linewidth'] and
    plt.rcParams['text.color'] globally.
    """
    # cf https://matplotlib.org/users/customizing.html
    # Set before any artist is created: these values are read when wedges and
    # texts are built, so setting them after drawing made the first call look
    # different from the following ones.
    plt.rcParams['patch.linewidth'] = 0
    plt.rcParams['text.color'] = '#2b2b2b'

    # One colour per wedge: the displayed questions plus 'others'.
    colors = cm.Set3(np.arange(sortedQuestionsLastIndex+1.)/(sortedQuestionsLastIndex + 1.))

    sortedQuestionsLabelsArray = np.append(
        sortedQuestions.values.flatten()[:sortedQuestionsLastIndex],
        'others'
    )
    sortedQuestionsValuesArray = np.append(
        sortedQuestions.index[:sortedQuestionsLastIndex],
        sum(sortedQuestions.index[sortedQuestionsLastIndex:])
    )

    fig1, ax1 = plt.subplots()
    ax1.pie(sortedQuestionsValuesArray, labels=sortedQuestionsLabelsArray, autopct='%1.1f%%', startangle=100, colors = colors)
    # Equal axis scaling.
    ax1.axis('equal')

    plt.title(title)
    plt.tight_layout()

    plt.show()
In [ ]:
# Pie chart of the ten first rows of sortedQuestions0; the remaining weights
# are summed into a single 'others' wedge.
displayQuestionsContributions(sortedQuestions0, sortedQuestionsLastIndex = 10, title = 'Contributions of questions to component 1')
In [ ]:
# Sum of the squared weights stored in the index of sortedQuestions0
# (equals 1 when the underlying vector has unit length).
sum(sortedQuestions0.index**2)
In [ ]:
# Rank the questions by their absolute weight in the second principal component.
# eig_pairs[1][1] is the eigenvector of the second largest eigenvalue;
# eig_vecs[1] would be a row of the eigenvector matrix, not a component.
secondComponentWeights = abs(eig_pairs[1][1])
# argsort yields each position exactly once, even when two weights are equal
# (a value lookup with np.where would return the same position twice).
sortedIndices = np.argsort(secondComponentWeights)[::-1]
descendingWeights = secondComponentWeights[sortedIndices]
# Index: weights in descending order; single data column: the matching question.
sortedQuestions1 = pd.DataFrame(index = descendingWeights, data = binarized.columns[sortedIndices])
sortedQuestions1
In [ ]:
# Pie chart of the ten first rows of sortedQuestions1; the remaining weights
# are summed into a single 'others' wedge.
displayQuestionsContributions(sortedQuestions1, sortedQuestionsLastIndex = 10, title = 'Contributions of questions to component 2')
In [ ]:
# Sum of the squared weights stored in the index of sortedQuestions1
# (equals 1 when the underlying vector has unit length).
sum(sortedQuestions1.index**2)